github_dir <- "C:/Users/Karol/Desktop/Code for Github"
read_count_dir <- paste0(github_dir, "/Read_counts")
setwd(github_dir)
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, warning = FALSE, error=FALSE, echo=TRUE, cache=TRUE,
fig.width = 7, fig.height = 6, fig.align = 'left')
# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)
setwd(github_dir)
# Generate mm9 list of exon sizes
# Method description at: https://www.biostars.org/p/83901/
txdb <- makeTxDbFromGFF("G:/Shared drives/Nord Lab - Personal Folders/Karol/Genomes/Mus_musculus/UCSC/mm9/Annotation/genes.gtf", format="gtf")
exons.list.per.gene <- exonsBy(txdb,by="gene")
# I paralelized this increasing the speed >2x on a 4-core (logical) machine.
# IMPORTANT: Use mclapply instead of parLapply if you use a Mac.
cl <- makeCluster(detectCores())
exonic.gene.sizes <- parallel::parLapply(cl, exons.list.per.gene,function(x){sum(width(reduce(x)))})
stopCluster(cl)
# Reading and merging count tables
# These files contain P0 data
setwd(read_count_dir)
if (file.exists("Project_Nord_MERGED_HS674_mm9-50_fC0_UN.txt")==F) {
count.1 <- read.table("Project_Nord_L3_HS674_mm9-50_fC0_UN.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
count.2 <- read.table("Project_Nord_L7_HS674_mm9-50_fC0_UN.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
count.3 <- read.table("Project_ANIZ_L6_H674P_Zdilar_mm9-50_fC0_UN.out", sep="\t", header=T, as.is=T, stringsAsFactors=F)
count.3 <- count.3[,c(1,4,2,3,5:ncol(count.3))]
count.final <- count.1[,1:12]
count.final[,1] <- count.1[,1]
for (index in 2:12) {
count.final[,index] <- count.1[,index] + count.2[,index] + count.3[,index]
}
colnames(count.final) <- c("Gene", "MIA_P0_1.S44.L006", "MIA_P0_10.S53.L006", "MIA_P0_11.S54.L006", "MIA_P0_2.S45.L006", "MIA_P0_3.S46.L006", "MIA_P0_4.S47.L006", "MIA_P0_5.S48.L006","MIA_P0_6.S49.L006", "MIA_P0_7.S50.L006", "MIA_P0_8.S51.L006", "MIA_P0_9.S52.L006")
#write.table(count.final, "Project_Nord_MERGED_HS674_mm9-50_fC0_UN.txt", sep="\t", col.names=T, row.names=F, quote=F)
}
# This is processing E12.5 and E17.5 data
# Merge count data from split row
# Renames columns in the count matrix
#setwd("G:/Shared drives/Nord Lab - Data/MIAx00/RNAseq/Results")
if (file.exists("ALL_MATRIX_RENAMED.txt")==F) {
count.1 <- read.table("ALL_MATRIX.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
update.col.names <- colnames(count.1)
update.col.names <- gsub("X", "", update.col.names)
sample.id <- strsplit(update.col.names, "_")
e12.5.samples <- c("1623.1",
"1623.2",
"1635.4",
"1635.5",
"1644.3",
"1644.4",
"1646.3",
"1646.4",
"1819.2",
"1819.4",
"1819.5",
"1858.1")
e17.5.samples <- c("1631.1",
"1631.2",
"1641.4",
"1641.5",
"1712.1",
"1712.3",
"1823.1",
"1823.2",
"1826.3",
"1826.4",
"1842.11",
"1842.1")
for (index in 1:length(update.col.names)) {
if ((strsplit(update.col.names[index], "_")[[1]][1] %in% e12.5.samples) == T) {
update.col.names[index] <- gsub("_", "\\.", paste("MIA.e12.5.", gsub("_R1_001", "", update.col.names[index]), sep=""))
}
if ((strsplit(update.col.names[index], "_")[[1]][1] %in% e17.5.samples) == T) {
update.col.names[index] <- gsub("_", "\\.", paste("MIA.e17.5.", gsub("_R1_001", "", update.col.names[index]), sep=""))
}
}
# Quick sanity check looking at colnames before and after renaming
#data.frame(colnames(count.1), update.col.names)
count.final <- count.1
colnames(count.final) <- update.col.names
count.final <- count.final[,grep("Undetermined", update.col.names, invert = T)] # Removes "Undetermined" samples
count.final <- cbind(Gene=rownames(count.final), count.final)
#write.table(count.final, "ALL_MATRIX_RENAMED.txt", sep="\t", col.names=T, row.names=F, quote=F)
}
# Organizes sample.info
setwd(read_count_dir)
sample.info <- read.table("MIA.RNASeq.SampleInfo.20180710.txt", sep="\t", header=T, as.is=T, stringsAsFactors=F)
sample.info[,2] <- gsub("_", ".", sample.info[,2])
sample.info[,2] <- gsub("-", ".", sample.info[,2])
sample.info[, "CountFile"] <- gsub("ALL_MATRIX.txt", "ALL_MATRIX_RENAMED.txt", sample.info[, "CountFile"])
sample.info <- sample.info[order(as.character(sample.info[,2])),]
sample.info[,"DPC"] <- ifelse(sample.info[,"DPC"]=="21", 19.5,sample.info[,"DPC"])
# Reading and merging the remaining count tables - generates exp.data object
setwd(read_count_dir)
count.filenames <- unique(sample.info[,"CountFile"])
for (index in 1:length(count.filenames)) {
temp <- read.table(as.character(count.filenames[index]), sep="\t", header=T, as.is=T, stringsAsFactors = F)
named.cols <- max(grep("MIA", colnames(temp)))
temp <- temp[,1:named.cols]
if (index == 1) {
exp.data <- temp
colnames(exp.data)[1] <- "Gene"
} else {
split.id <- colnames(temp)[1]
exp.data <- merge(exp.data, temp, by.x="Gene", by.y=split.id)
}
}
colnames(exp.data) <- gsub("_", ".", colnames(exp.data))
colnames(exp.data) <- gsub("-", ".", colnames(exp.data))
exp.data <- exp.data[,order(colnames(exp.data))]
rownames(exp.data) <- exp.data[,1]
exp.data <- exp.data[,2:ncol(exp.data)]
A threshold of 1000 counts of Xist transcript was used to identify sex
# I'm checking if the 1000 count threshorld for Xist makes sense. It does.
# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #
Xist_exp <- as.data.frame(melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")
ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
geom_jitter()+
geom_boxplot(alpha=0.2)+
theme_bw()+
labs(title="Xist read counts")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))
# Samples marked as outliers were removed.
# Identifies outliers
outlier <- ifelse(sample.info[,"Outlier"]=="1",1,0) #Samples marked with 1 are Outliers
outlier <- ifelse(is.na(sample.info[,"Outlier"]),0,outlier) #
#Preserving data with outliers
exp.data_w_outliers <- exp.data
sample.info_w_outliers <- sample.info
# Removes the outliers from the dataset
exp.data <- exp.data[,which(outlier==0)] #Filters exp.data columns(samples) which are not outliers
sample.info <- sample.info[which(outlier==0),] #It does the same for the rows of sample info
# Assigns group values 1 for Saline; 2 for PolyIC, 3 for Inhibitor
group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
group <- ifelse(sample.info[,"Condition"]=="Inhibitor",3,group) #
#group <- ifelse(sample.info[,"Condition"]=="unknown",4,group)
group <- factor(group)
# Assigns sex value, 1 for Male, 2 for Female and 3 for unknown.
# There may be a logical error in this assignment because there are 4 symbols deliminating sex "M" "F" "?" ""
# Therefore ?" "" get assigned the value of 2.
#IMPORTANT: Check if for the DE expression sex is called from the RNA or from this column.
#ANSWER: No. Sex in the analysis is called from the Xist RNA expression. This sex calling is done just pro forma.
sex <- ifelse(sample.info[,"Sex"]=="M",1,2)
sex <- ifelse(sample.info[,"Sex"]=="unknown",3,sex)
sex <- factor(sex)
#Adds sex.by.rna and to sample.info. Double check if there wans't an error with assigning sex by RNA into the sample.info data.
sample.info <- data.frame(sample.info, sex_by_rna = sex.by.rna[which(outlier!=1)])
sample.info <- cbind(sample.info, ExperimentalDesign=paste(sample.info[,"DPC"], sample.info[,"Condition"], sep="_"))
# Writes updated sample.infor and exp.data to files
#write.table(sample.info, "UpdatedSampleInfo.txt", sep="\t", col.names=T, row.names=F)
#write.table(cbind(Gene=rownames(exp.data), exp.data), "MIA_GENE_MATRIX.txt", sep="\t", col.names=T, row.names=F)
# Renames columns in sample.info - housekeeping
info.col.names <- colnames(sample.info)
sample.info <- sample.info[,c(2,1,3:ncol(sample.info))]
colnames(sample.info) <- info.col.names
# And writes another file...
#write.table(sample.info, "MIA_SAMPLE_INFO.txt", sep="\t", col.names=T, row.names=F, quote=F)
# Writes a sample.info file lacking the inhibitor and lane 12 samples
#write.table(sample.info[which(group!=3 & sample.info[,"Lane"]!=12),], "MIA_SAMPLE_INFO_POLYIC.txt", sep="\t", col.names=T, row.names=F, quote=F)
#write.table(cbind(Gene=rownames(exp.data), exp.data), "MIA_GENE_MATRIX.txt", sep="\t", col.names=T, row.names=F, quote=F)
#datatable(sample.info, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
# Generates vectors/factors from the outlier-filtered dataset for the analysis
sex.by.rna <- ifelse(exp.data["Xist",]>1000,"2","1")
sex.by.rna <- factor(sex.by.rna)
response <- ifelse(sample.info[,"Response"]=="med",1,2)
response <- factor(response)
#unknown.batch <- ifelse(sample.info[,"ControlOutlier"]=="0",1,2)
#unknown.batch <- factor(unknown.batch)
lane <- factor(sample.info[,"Lane"])
dpc <- sample.info[,"DPC"]
dpc <- ifelse(dpc=="P0", 19.5, dpc)
dpc <- ifelse(dpc=="21", 19.5, dpc) # Why??
dpc <- ifelse(dpc=="e14.5", 14.5, dpc)
dpc <- ifelse(dpc=="e12.5", 12.5, dpc)
dpc <- ifelse(dpc=="e17.5", 17.5, dpc)
dpc <- as.factor(dpc)
rRNA <- exp.data["Rn45s",]/colSums(exp.data) #45S rRNA serves as the precursor for the 18S, 5.8S and 28S rRNA. It's used as a normalization factor
colnames(exp.data) <- paste(dpc, group, sex.by.rna, response, lane, sample.info[,"Library"], sep="_") #Interesting encoding
exp.original <- exp.data
# exp.data colnames encoding
colnames(exp.data) <- paste(dpc, group, sex.by.rna, response, lane, sample.info[,"Library"], sep="_")
# Calculates RPKM values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
# edgeR settings
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
# Removes batch effect associated with lane of sequencing. "design: optional design matrix relating to treatment conditions to be preserved". I think this is may be incorrect since the exp.data contains raw counts.Run this on rpkm data instead.
rpkm.batch <- removeBatchEffect(exp.data, batch=lane, design=cbind(dpc, group, sex.by.rna, response))
### Removes genes with low expression ###
# Lets plot a histogram of all rpkm values in a dataset.
# dim(rpkm.data) #24015 rows and 102 columns
df <- melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")
# Histogram of all rpkm values in a dataset.
#ggplot(df, aes(x=rpkm_log2)) + geom_histogram(bins = 1000, color="black")+theme_bw()+labs(title="Complete dataset of 24015 genes", x="log2 RPKMsfrom all samples")
# Setting the threshold for minimum rpkm value in at least 2 samples.
threshold = -2
# That translates to 17195 genes out of a total of 24015
#sum(melt(rowSums(rpkm.data > threshold) >= 2)$value)
# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name
#
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)
# Filters the "keep" genes
datExpr_test <- filter(datExpr_test, gene_name %in% keep)
# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL
# Overwrites the original datExpr object.
datExpr <- datExpr_test
# Plots the histogram after filtering
df <- melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")
#ggplot(df, aes(x=rpkm_log2)) + geom_histogram(bins = 1000, color="black")+theme_bw()+labs(title=paste("After filtering", "n of genes" , nrow(datExpr),".", "Threshold in at least 2 samples >", threshold))
# Removing low expression genes. Overwrites exp.data object used for DE analysis ###
#dim(exp.data) #exp.data contains counts, datExpr contains rpkms
exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))
#dim(exp.data)
#head(exp.data)
rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL
#dim(exp.data)
#head(exp.data)
# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
datatable(sample.info[,setdiff(c(1:23), c(7, 8, 14, 15, 16, 18, 21:23))],
rownames = FALSE,
filter = list(position = 'top', clear = FALSE, plain = FALSE),
options = list(pageLength = 10, scrollX=T, scrollY=T))
setwd(github_dir)
source("Dimensionality reduction plots.r")
grid.arrange(PCA_plot_without_inhibitor_and_lane_12, PCA_plot_without_inhibitor_and_lane_12_sex, PCA_plot_w_inh, ncol = 3)
control.datapoints <- intersect(which(group=="1"), which(lane != "12"))
control.datapoints <- intersect(control.datapoints, which(rRNA < 0.01))
#control.datapoints <- intersect(control.datapoints, which(response == 1)) # keeping only medium responders doesn't preserve enought data
polyic.datapoints <- intersect(which(group=="2"), which(lane != "12"))
polyic.datapoints <- intersect(polyic.datapoints, which(rRNA < 0.01))
#polyic.datapoints <- intersect(polyic.datapoints, which(response == 1))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)
#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
new.control.datapoints <- which(use.cols %in% control.datapoints)
new.polyic.datapoints <- which(use.cols %in% polyic.datapoints)
train.data <- data.frame(PCA.1=pca.results$rotation[new.control.datapoints,1],
PCA.2=pca.results$rotation[new.control.datapoints,2],
PCA.3=pca.results$rotation[new.control.datapoints,3],
PCA.4=pca.results$rotation[new.control.datapoints,4],
PCA.5=pca.results$rotation[new.control.datapoints,5],
rRNA=as.numeric(rRNA[new.control.datapoints]),
sex=sex.by.rna[new.control.datapoints],
lane=lane[new.control.datapoints],
response=test.response[new.control.datapoints]
)
predict.data <- data.frame(PCA.1=pca.results$rotation[new.polyic.datapoints,1],
PCA.2=pca.results$rotation[new.polyic.datapoints,2],
PCA.3=pca.results$rotation[new.polyic.datapoints,3],
PCA.4=pca.results$rotation[new.polyic.datapoints,4],
PCA.5=pca.results$rotation[new.polyic.datapoints,5],
rRNA=as.numeric(rRNA[new.polyic.datapoints]),
sex=sex.by.rna[new.polyic.datapoints],
lane=lane[new.polyic.datapoints],
response=test.response[new.polyic.datapoints]
)
lm.model <- lm(as.numeric(as.character(dpc[control.datapoints])) ~
PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + rRNA + sex + as.numeric(lane) + response
, data = train.data
)
# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)
# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(dpc[control.datapoints])==unique(dpc)[x])]))
dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(dpc[control.datapoints])==unique(dpc)[x])]))
dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(dpc[polyic.datapoints])==unique(dpc)[x])]))
dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(dpc[polyic.datapoints])==unique(dpc)[x])]))
dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)
lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)
####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(dpc[control.datapoints])),
"Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))
df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(dpc[polyic.datapoints])),
"Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))
df_combined <- rbind(df_1, df_2)
mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
)
mean_data <- as.data.frame(mean_data)
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))
Fig_4c <- ggplot()+
geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC, color=Condition), size = 2, height= 0.3, alpha=0.7)+
geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
theme_bw()+
scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
labs(y = "Predicted DPC", x = "Actual DPC")+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Fig_4c
19.5 was used as a numerical representation of the P0 time point.
#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)
##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])
t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])
t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])
t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])
t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)
predicted_DPC_df <- melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))
t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))
knitr::kable(predicted_DPC_df_nice)
| Real_DPC | Predicted_DPC_Saline | Predicted_DPC_PolyIC | t_test_p_value |
|---|---|---|---|
| 12.5 | 12.78274 | 12.46882 | 0.46920042 |
| 14.5 | 14.79024 | 15.57854 | 0.07329140 |
| 17.5 | 16.72892 | 18.72772 | 0.00000456 |
| 19.5 | 19.38742 | 19.26204 | 0.65641544 |
single_timepoint_glm_function<- function(x){
control.datapoints <- intersect(which(group=="1"), which(lane != "12"))
control.datapoints <- intersect(control.datapoints, which(rRNA < 0.01))
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(which(group=="2"), which(lane != "12"))
polyic.datapoints <- intersect(polyic.datapoints, which(rRNA < 0.01))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)
E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))
E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))
E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))
E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))
###
E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))
E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))
E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))
E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))
DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
"E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
"E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
"P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))
colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")
DE_df_n_melted <- melt(DE_df_n)
DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))
DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)
Fig2c <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
geom_bar(position = "dodge", stat="identity", color="black")+
theme_bw()+
#scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
theme(legend.title=element_blank())+
labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
coord_flip()+
scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
theme(panel.border = element_blank(),
#legend.key = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.text.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position="bottom")
Fig2c
volcano_plot_text_2 <- function(x, title) {
#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
################# Version capped at different y axis levels ###############
volcano_plot_text_3 <- function(x, title) {
#This is nicely dealing with datasets with missing cathegories.
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
plotDat$log10_PValue <- -log10(plotDat$PValue)
plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)
#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+
p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
expand_limits(x = c(-4.2, 4.2))+
coord_cartesian(ylim = c(0, 8))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
p <- volcano_plot_text_3(filter(E12.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E12.5 GLM DE analysis")
p
datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(filter(E14.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E14.5 GLM DE analysis")
p
datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_2(filter(E17.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "E17.5 GLM DE analysis")
p
datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(filter(E19.5_GLM, logFC > 1 | logFC < -1 | PValue < 0.05), "P0 GLM DE analysis")
p
datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))